{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1063: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n",
            "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1071: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n",
            "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1086: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n",
            "d:\\softwares\\python\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n",
            "d:\\softwares\\python\\lib\\site-packages\\umap\\umap_.py:660: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n"
          ]
        }
      ],
      "source": [
        "import numpy as np\n",
        "from bertopic import BERTopic\n",
        "from sentence_transformers import SentenceTransformer\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from transformers.pipelines import pipeline\n",
        "from umap import UMAP\n",
        "from hdbscan import HDBSCAN\n",
        "from sklearn.feature_extraction.text import CountVectorizer"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 加载数据"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "条数:  1000\n",
            "预览第一条:  文旅文 创看 洛阳 河南省 文旅文创 发展 大会 本次 大会 安排 项目 签约 主要 方面 内容 一是 文旅 产业 项目 签约 截至 目前 梳理 重点 文旅 项目 投资总额 525.6 亿元 遴选 重大项目 进行 现场 签约 投资总额 365.8 亿元 项目 包括 文物 数字化 开发 文化 创意 园区 建设 文化 项目 涵盖 旅游 度假区 建设 旅游 酒店 民宿 打造 旅游 项目 既有 旅游 景区 开发 商旅 综合体 建设 传统 业态 项目 宇宙 基地 沉浸 演艺 业态 项目 充分体现 我省 文化 旅游 发展 特点 趋势 二是 引客 入豫 项目 签约 主要 我省 文旅 部门 文旅 企业 头部 旅行 知名 OTA 平台 重点 客源地 文旅 部门 签订 引客 入豫 协议 持续 拓展 省外 客源 市场\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# step1 加载文件\n",
        "with open('../../data/切词.txt', 'r', encoding='utf-8') as file:\n",
        "  docs = file.readlines()\n",
        "print('条数: ', len(docs))\n",
        "print('预览第一条: ', docs[0])\n",
        "\n",
        "vectorizer_model = None"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 创建"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']\n",
            "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "(1000, 768)\n"
          ]
        }
      ],
      "source": [
        "# 1. 词向量模型，同时加载本地训练好的词向量\n",
        "embedding_model = pipeline(\"feature-extraction\", model=\"bert-base-chinese\") # 使用bert-base-chinese\n",
        "embeddings = np.load('../../data/embedding_bbc.npy') # 使用bert-base-chinese向量\n",
        "print(embeddings.shape)\n",
        "\n",
        "# 2. 创建分词模型\n",
        "vectorizer_model = CountVectorizer() # 因为我们已经分好词了，所以这里不需要传入分词函数了\n",
        "\n",
        "# 3. 创建UMAP降维模型\n",
        "umap_model = UMAP(\n",
        "  n_neighbors=15,\n",
        "  n_components=5,\n",
        "  min_dist=0.0,\n",
        "  metric='cosine',\n",
        "  random_state=42  # ⚠️ 防止随机 https://maartengr.github.io/BERTopic/faq.html\n",
        ")\n",
        "\n",
        "# 4. 创建HDBSCAN聚类模型\n",
        "# 如果要建设离群值，可以减小下面两个参数\n",
        "# https://hdbscan.readthedocs.io/en/latest/faq.html\n",
        "hdbscan_model = HDBSCAN(\n",
        "  min_cluster_size=20,\n",
        "  min_samples=5,\n",
        "  prediction_data=True # 一定要注意，如果后文设置了calculate_probabilities = True，一定要设置该参数 https://github.com/MaartenGr/BERTopic/issues/1103\n",
        ")\n",
        "\n",
        "# 5. 创建CountVectorizer模型\n",
        "vectorizer_model = CountVectorizer(stop_words=['洛阳', '旅游', '文化'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Topic</th>\n",
              "      <th>Count</th>\n",
              "      <th>Name</th>\n",
              "      <th>Representation</th>\n",
              "      <th>Representative_Docs</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-1</td>\n",
              "      <td>247</td>\n",
              "      <td>-1_景区_城市_河南_中国</td>\n",
              "      <td>[景区, 城市, 河南, 中国, 游客, 历史, 洛阳市, 遗址, 博物馆, 发展]</td>\n",
              "      <td>[河南 多家 景区 陆续 发布 开园 公告 台风 杜苏芮 强度 逐渐 减弱 河南 景区 陆续...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>0</td>\n",
              "      <td>162</td>\n",
              "      <td>0_景区_发展_项目_建设</td>\n",
              "      <td>[景区, 发展, 项目, 建设, 国家, 河南省, 活动, 河南, 工作, 洛阳市]</td>\n",
              "      <td>[行走 河南 读懂 中国 关注 全省 文旅文创 发展 大会 二十大 报告 指出 坚持 以文塑...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1</td>\n",
              "      <td>98</td>\n",
              "      <td>1_活动_景区_免费_高速</td>\n",
              "      <td>[活动, 景区, 免费, 高速, 门票, 时间, 白云山, 地点, 栾川, 安全可靠]</td>\n",
              "      <td>[洛阳 身边 自驾游 栾川 高速 免费 答疑 自驾游 栾川 高速 费全免 问题 需要 了解 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>2</td>\n",
              "      <td>81</td>\n",
              "      <td>2_石窟_中国_龙门石窟_艺术</td>\n",
              "      <td>[石窟, 中国, 龙门石窟, 艺术, 世界, 莫高窟, 文化遗产, 朝代, 造像, 少林寺]</td>\n",
              "      <td>[旅行 洛阳 龙门石窟 中国 石刻 艺术 宝库 现为 世界 文化遗产 全国 重点 文物保护 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>3</td>\n",
              "      <td>66</td>\n",
              "      <td>3_王府_竹海_河南_旅行</td>\n",
              "      <td>[王府, 竹海, 河南, 旅行, 景区, 安阳, 瀑布, 旅游区, 栾川, 度假]</td>\n",
              "      <td>[每年 中国 旅游 一天 选择 出游 可能 今年 旅行 计划 实现 一定 提前 规划 今天 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>4</td>\n",
              "      <td>52</td>\n",
              "      <td>4_晚会_广场_活动_文旅</td>\n",
              "      <td>[晚会, 广场, 活动, 文旅, 体验, 话题, 河南, 历史, 沉浸, 隋唐洛阳城]</td>\n",
              "      <td>[中国 旅游 泉州 举行 多项 文旅 活动 鲤城区 城南 庙会 再次 开启 传统工艺 传统 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>5</td>\n",
              "      <td>44</td>\n",
              "      <td>5_建设_机场_项目_规划</td>\n",
              "      <td>[建设, 机场, 项目, 规划, 铁路, 自驾车, 黄河, 发展, 国家, 航线]</td>\n",
              "      <td>[国家 发展 改革 印发 汉江 生态 经济带 发展 规划 通知 汉江 生态 经济带 规划 范...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>6</td>\n",
              "      <td>42</td>\n",
              "      <td>6_地方_很多_一天_一点</td>\n",
              "      <td>[地方, 很多, 一天, 一点, 已经, 时间, 古城, 酒店, 西安, 感受]</td>\n",
              "      <td>[洛阳 值得 一去 地方 简单 下来 洛阳 旅游 攻略 建议 夏天 洛阳 洛阳 市区 主要 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>7</td>\n",
              "      <td>42</td>\n",
              "      <td>7_泉州_旅行_洛阳桥_历史</td>\n",
              "      <td>[泉州, 旅行, 洛阳桥, 历史, 济南, 中国, 古城, 沧州, 黄陂, 洛邑]</td>\n",
              "      <td>[海丝 泉州 夏天 清晨 洛阳桥 洛阳 日出 夏天 清晨 来到 泉州 洛阳桥 沐浴 晨光 长...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>8</td>\n",
              "      <td>41</td>\n",
              "      <td>8_旅客_年票_郑州_客流</td>\n",
              "      <td>[旅客, 年票, 郑州, 客流, 列车, 出行, 客运, 高速, 发送, 景区]</td>\n",
              "      <td>[今天 郑州 铁路 预计 发送 旅客 51.7 万人 管内 客流量 激增 中国 铁路 郑州 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10</th>\n",
              "      <td>9</td>\n",
              "      <td>27</td>\n",
              "      <td>9_防控_疫情_景区_开放</td>\n",
              "      <td>[防控, 疫情, 景区, 开放, 场所, 关闭, 恢复, 人员, 娱乐场所, 落实]</td>\n",
              "      <td>[洛阳 即日起 洛阳 暂时 关闭 网吧 KTV 人员 聚集 密闭 场所 小编 刚刚 洛阳市 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>11</th>\n",
              "      <td>10</td>\n",
              "      <td>27</td>\n",
              "      <td>10_游客_景区_老君山_午餐</td>\n",
              "      <td>[游客, 景区, 老君山, 午餐, 表示, 同学, 孙军范, 卷入, 河南, 工作人员]</td>\n",
              "      <td>[无人 值守 一元 午餐 结账 多出 国庆 假期 景区 连续 推出 午餐 共售 游客 点赞 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>12</th>\n",
              "      <td>11</td>\n",
              "      <td>25</td>\n",
              "      <td>11_河南_巩义_江南_中国</td>\n",
              "      <td>[河南, 巩义, 江南, 中国, 故城, 历史, 善良, 考古队, 顾荣, 范仲淹]</td>\n",
              "      <td>[文化 自然遗产 专属 书单 每年 第二个 星期六 我国 文化 自然遗产 活动 主题 多次 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>13</th>\n",
              "      <td>12</td>\n",
              "      <td>25</td>\n",
              "      <td>12_洞中_广济桥_建筑_鸡冠</td>\n",
              "      <td>[洞中, 广济桥, 建筑, 鸡冠, 白马寺, 周王城, 始建, 潮州, 大理, 公元]</td>\n",
              "      <td>[车载 民生 行天下 河南 旅游 行天下 汽车票 旅游 风景 智慧 购票 便民 出行 河南 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>14</th>\n",
              "      <td>13</td>\n",
              "      <td>21</td>\n",
              "      <td>13_增长_五一_数据_预订</td>\n",
              "      <td>[增长, 五一, 数据, 预订, 热门, 假期, 上海, 显示, 国内, 同期]</td>\n",
              "      <td>[携程 联合 新华 财经 发布 五一 旅行 数据 报告 报告 显示 北京 热门 旅游 城市 ...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "    Topic  Count             Name  \\\n",
              "0      -1    247   -1_景区_城市_河南_中国   \n",
              "1       0    162    0_景区_发展_项目_建设   \n",
              "2       1     98    1_活动_景区_免费_高速   \n",
              "3       2     81  2_石窟_中国_龙门石窟_艺术   \n",
              "4       3     66    3_王府_竹海_河南_旅行   \n",
              "5       4     52    4_晚会_广场_活动_文旅   \n",
              "6       5     44    5_建设_机场_项目_规划   \n",
              "7       6     42    6_地方_很多_一天_一点   \n",
              "8       7     42   7_泉州_旅行_洛阳桥_历史   \n",
              "9       8     41    8_旅客_年票_郑州_客流   \n",
              "10      9     27    9_防控_疫情_景区_开放   \n",
              "11     10     27  10_游客_景区_老君山_午餐   \n",
              "12     11     25   11_河南_巩义_江南_中国   \n",
              "13     12     25  12_洞中_广济桥_建筑_鸡冠   \n",
              "14     13     21   13_增长_五一_数据_预订   \n",
              "\n",
              "                                    Representation  \\\n",
              "0       [景区, 城市, 河南, 中国, 游客, 历史, 洛阳市, 遗址, 博物馆, 发展]   \n",
              "1       [景区, 发展, 项目, 建设, 国家, 河南省, 活动, 河南, 工作, 洛阳市]   \n",
              "2      [活动, 景区, 免费, 高速, 门票, 时间, 白云山, 地点, 栾川, 安全可靠]   \n",
              "3   [石窟, 中国, 龙门石窟, 艺术, 世界, 莫高窟, 文化遗产, 朝代, 造像, 少林寺]   \n",
              "4        [王府, 竹海, 河南, 旅行, 景区, 安阳, 瀑布, 旅游区, 栾川, 度假]   \n",
              "5      [晚会, 广场, 活动, 文旅, 体验, 话题, 河南, 历史, 沉浸, 隋唐洛阳城]   \n",
              "6        [建设, 机场, 项目, 规划, 铁路, 自驾车, 黄河, 发展, 国家, 航线]   \n",
              "7         [地方, 很多, 一天, 一点, 已经, 时间, 古城, 酒店, 西安, 感受]   \n",
              "8        [泉州, 旅行, 洛阳桥, 历史, 济南, 中国, 古城, 沧州, 黄陂, 洛邑]   \n",
              "9         [旅客, 年票, 郑州, 客流, 列车, 出行, 客运, 高速, 发送, 景区]   \n",
              "10      [防控, 疫情, 景区, 开放, 场所, 关闭, 恢复, 人员, 娱乐场所, 落实]   \n",
              "11    [游客, 景区, 老君山, 午餐, 表示, 同学, 孙军范, 卷入, 河南, 工作人员]   \n",
              "12      [河南, 巩义, 江南, 中国, 故城, 历史, 善良, 考古队, 顾荣, 范仲淹]   \n",
              "13     [洞中, 广济桥, 建筑, 鸡冠, 白马寺, 周王城, 始建, 潮州, 大理, 公元]   \n",
              "14        [增长, 五一, 数据, 预订, 热门, 假期, 上海, 显示, 国内, 同期]   \n",
              "\n",
              "                                  Representative_Docs  \n",
              "0   [河南 多家 景区 陆续 发布 开园 公告 台风 杜苏芮 强度 逐渐 减弱 河南 景区 陆续...  \n",
              "1   [行走 河南 读懂 中国 关注 全省 文旅文创 发展 大会 二十大 报告 指出 坚持 以文塑...  \n",
              "2   [洛阳 身边 自驾游 栾川 高速 免费 答疑 自驾游 栾川 高速 费全免 问题 需要 了解 ...  \n",
              "3   [旅行 洛阳 龙门石窟 中国 石刻 艺术 宝库 现为 世界 文化遗产 全国 重点 文物保护 ...  \n",
              "4   [每年 中国 旅游 一天 选择 出游 可能 今年 旅行 计划 实现 一定 提前 规划 今天 ...  \n",
              "5   [中国 旅游 泉州 举行 多项 文旅 活动 鲤城区 城南 庙会 再次 开启 传统工艺 传统 ...  \n",
              "6   [国家 发展 改革 印发 汉江 生态 经济带 发展 规划 通知 汉江 生态 经济带 规划 范...  \n",
              "7   [洛阳 值得 一去 地方 简单 下来 洛阳 旅游 攻略 建议 夏天 洛阳 洛阳 市区 主要 ...  \n",
              "8   [海丝 泉州 夏天 清晨 洛阳桥 洛阳 日出 夏天 清晨 来到 泉州 洛阳桥 沐浴 晨光 长...  \n",
              "9   [今天 郑州 铁路 预计 发送 旅客 51.7 万人 管内 客流量 激增 中国 铁路 郑州 ...  \n",
              "10  [洛阳 即日起 洛阳 暂时 关闭 网吧 KTV 人员 聚集 密闭 场所 小编 刚刚 洛阳市 ...  \n",
              "11  [无人 值守 一元 午餐 结账 多出 国庆 假期 景区 连续 推出 午餐 共售 游客 点赞 ...  \n",
              "12  [文化 自然遗产 专属 书单 每年 第二个 星期六 我国 文化 自然遗产 活动 主题 多次 ...  \n",
              "13  [车载 民生 行天下 河南 旅游 行天下 汽车票 旅游 风景 智慧 购票 便民 出行 河南 ...  \n",
              "14  [携程 联合 新华 财经 发布 五一 旅行 数据 报告 报告 显示 北京 热门 旅游 城市 ...  "
            ]
          },
          "execution_count": 4,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "topic_model = BERTopic(\n",
        "  embedding_model=embedding_model,\n",
        "  vectorizer_model=vectorizer_model,\n",
        "  umap_model=umap_model,\n",
        "  hdbscan_model=hdbscan_model,\n",
        "  # 官方文档\n",
        "  # The calculate_probabilties parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, this will not work when using a model other than HDBSCAN\n",
        "\n",
        "  # 源码注释\n",
        "  # https://maartengr.github.io/BERTopic/faq.html#how-do-i-calculate-the-probabilities-of-all-topics-in-a-document\n",
        "  # calculate_probabilities: Calculate the probabilities of all topics\n",
        "  #                        per document instead of the probability of the assigned\n",
        "  #                        topic per document. This could slow down the extraction\n",
        "  #                        of topics if you have many documents (> 100_000). \n",
        "  #                        NOTE: If false you cannot use the corresponding\n",
        "  #                        visualization method `visualize_probabilities`.\n",
        "  #                        NOTE: This is an approximation of topic probabilities\n",
        "  #                        as used in HDBSCAN and not an exact representation.\n",
        "  # 简而言之，这个值默认是False\n",
        "  # 它的含义是：如果设置为False（默认值），则只计算一条文档归属于其所属主题的概率；如果设置为True则计算一条文档归属于所有主题的概率。但可能会降低运算效率\n",
        "  # 其次，该参数仅用于使用HDBSCAN做聚类的场景\n",
        "  calculate_probabilities = True\n",
        ")\n",
        "\n",
        "# 源码注释\n",
        "# predictions: Topic predictions for each documents\n",
        "# probabilities: The probability of the assigned topic per document.\n",
        "#   If `calculate_probabilities` in BERTopic is set to True, then\n",
        "#   it calculates the probabilities of all topics across all documents\n",
        "#   instead of only the assigned topic. This, however, slows down\n",
        "#   computation and may increase memory usage.\n",
        "topics, probs = topic_model.fit_transform(docs, embeddings=embeddings) #传入训练好的词向量\n",
        "topic_info = topic_model.get_topic_info()\n",
        "topic_info"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 先看一个topic_model.fit_transform输出的是什么"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "1000 [0, 4, -1, 11, 3, -1, 1, 6, 0, 6]\n",
            "1000 [[1.00000000e+000 2.24152299e-308 1.45906468e-308 2.16219967e-308\n",
            "  4.40626796e-308 6.17629056e-308 1.08959420e-308 1.78254211e-308\n",
            "  4.02618017e-308 3.67314470e-308 3.30194792e-308 2.06018860e-308\n",
            "  1.67322457e-308 7.51970417e-308]\n",
            " [8.79226977e-002 7.34716007e-002 2.57652175e-002 5.11323156e-002\n",
            "  3.48841499e-001 5.64940158e-002 2.26255714e-002 4.74269973e-002\n",
            "  3.59713596e-002 2.98453504e-002 3.16983059e-002 5.05101492e-002\n",
            "  3.49719677e-002 4.83085972e-002]\n",
            " [8.82620464e-002 4.79825392e-002 2.10665396e-002 3.72169570e-002\n",
            "  1.61851684e-001 4.56143031e-002 1.72399318e-002 3.46996267e-002\n",
            "  3.15045457e-002 2.67892338e-002 3.05467141e-002 4.04751835e-002\n",
            "  2.72501256e-002 4.75789515e-002]\n",
            " [2.98133626e-002 4.21757537e-002 5.09420560e-002 1.00959749e-001\n",
            "  4.33859199e-002 2.48338536e-002 4.09437812e-002 9.61033272e-002\n",
            "  1.82480207e-002 1.65999292e-002 1.76714829e-002 2.76230080e-001\n",
            "  1.40729141e-001 2.26467243e-002]\n",
            " [3.80705594e-002 4.46445854e-002 4.02329075e-002 3.82160077e-001\n",
            "  5.60264451e-002 3.32146392e-002 3.02016614e-002 8.05647087e-002\n",
            "  2.16761606e-002 1.88906009e-002 1.89653892e-002 9.59534803e-002\n",
            "  9.96785174e-002 2.62392392e-002]\n",
            " [3.70138205e-002 3.78764209e-002 1.01747419e-001 1.26840206e-001\n",
            "  4.32703580e-002 3.41646334e-002 3.21511385e-002 6.50424295e-002\n",
            "  2.32280550e-002 2.12040354e-002 2.13038919e-002 1.01725815e-001\n",
            "  1.83129284e-001 2.86953060e-002]\n",
            " [3.17736981e-002 3.64547153e-001 2.17057187e-002 5.40524601e-002\n",
            "  1.02387670e-001 2.90892337e-002 2.88663456e-002 6.66821634e-002\n",
            "  2.44504399e-002 2.05387660e-002 2.09117663e-002 3.74766449e-002\n",
            "  3.24416254e-002 2.57953584e-002]\n",
            " [1.20006179e-308 2.50555263e-308 1.65644326e-308 4.04346807e-308\n",
            "  1.78526606e-308 1.07555309e-308 1.00000000e+000 3.80299548e-308\n",
            "  9.05252931e-309 8.24375783e-309 9.10265504e-309 2.33537395e-308\n",
            "  2.59222516e-308 9.95006573e-309]\n",
            " [1.00000000e+000 2.35846330e-308 1.50145509e-308 2.27829305e-308\n",
            "  4.94059293e-308 6.83496402e-308 1.12517233e-308 1.86907722e-308\n",
            "  4.00044167e-308 3.55812369e-308 3.26991877e-308 2.15842494e-308\n",
            "  1.73626752e-308 7.80309084e-308]\n",
            " [1.27035022e-002 2.57410136e-002 1.85154839e-002 4.95071893e-002\n",
            "  1.90961971e-002 1.13416946e-002 1.66303822e-001 4.76876277e-002\n",
            "  9.30602319e-003 8.44480989e-003 9.35537831e-003 2.74566129e-002\n",
            "  3.18077904e-002 1.03710922e-002]]\n"
          ]
        }
      ],
      "source": [
        "print(len(topics), topics[:10])\n",
        "print(len(probs), probs[:10])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "(1000, 14)"
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "probs.shape"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
